{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "DataAnalysis"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "%matplotlib inline"
   ]
  },
  {
   "source": [
    "# Chap4 编写结构化的程序\n",
    "1.  怎样才能写出结构良好，可读性强的程序，从而方便重用？\n",
    "2.  基本的结构块，例如：循环、函数和赋值是如何执行的？\n",
    "3.  Python 编程的陷阱还有哪些，如何避免它们？"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "## 4.4 结构化编程的基础(P156)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Help on function get_text in module __main__:\n\nget_text(file)\n    Read text from a file, normalizing whitespace and stripping HTML markup.\n\ncontents=  Title\n"
     ]
    }
   ],
   "source": [
    "# Ex4-1: 从文件中读取文本\n",
    "import re\n",
    "\n",
    "\n",
    "def get_text(file):\n",
    "    \"\"\"Read text from a file, normalizing whitespace and stripping HTML markup.\"\"\"\n",
    "    text = open(file,encoding='utf-8').read()\n",
    "    text = re.sub(r'<.*?>', '', text)\n",
    "    text = re.sub('\\s+', '', text)\n",
    "    return text\n",
    "\n",
    "\n",
    "help(get_text)\n",
    "contents = get_text('test.html')\n",
    "print(\"contents= \",contents[:100])"
   ]
  },
  {
   "source": [
    "### 4.4.1 函数的输入与输出"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Monty Python Monty Python Monty Python\n"
     ]
    }
   ],
   "source": [
    "# 有参数的函数\n",
    "def repeat(msg, num):\n",
    "    return ' '.join([msg] * num)\n",
    "\n",
    "\n",
    "monty = 'Monty Python'\n",
    "print(repeat(monty, 3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "monty()=  Monty Python\nrepeat(monty(), 3)=  Monty Python Monty Python Monty Python\nrepeat('Monty Python', 3)=  Monty Python Monty Python Monty Python\n"
     ]
    }
   ],
   "source": [
    "# 无参数的函数\n",
    "def monty():\n",
    "    return 'Monty Python'\n",
    "\n",
    "\n",
    "print(\"monty()= \", monty())\n",
    "print(\"repeat(monty(), 3)= \", repeat(monty(), 3))\n",
    "print(\"repeat('Monty Python', 3)= \", repeat('Monty Python', 3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 没有返回值，调用者传入参数，就是请求函数帮助对输入参数排序\n",
    "def my_sort1(mylist):\n",
    "    mylist.sort()\n",
    "\n",
    "\n",
    "# 返回值是排序后的结果，传入的参数没有被改变\n",
    "def my_sort2(mylist):\n",
    "    return sorted(mylist)\n",
    "\n",
    "\n",
    "# 这个函数是危险的，因为输入值已经被修改，但是没有明确地通知调用者\n",
    "def my_sort3(mylist):\n",
    "    mylist.sort()\n",
    "    return mylist()"
   ]
  },
  {
   "source": [
    "### 4.4.2 参数传递(P159)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "w=  \np=  []\nw=  \np=  ['noun']\n"
     ]
    }
   ],
   "source": [
    "# 第一个传入参数在函数内部被改变了，但是调用者的参数内容没有被改变，因为是按内容传值的\n",
    "# 第二个传入参数在函数内部被改变了，调用的参数内容也被改变了，因为是按地址传值的\n",
    "def set_up(word, properties):\n",
    "    word = 'lolcat'\n",
    "    properties.append('noun')\n",
    "    properties = 5\n",
    "\n",
    "\n",
    "# w没有被改变，p被改变了\n",
    "w = ''\n",
    "p = []\n",
    "print(\"w= \", w)\n",
    "print(\"p= \", p)\n",
    "set_up(w, p)\n",
    "print(\"w= \", w)\n",
    "print(\"p= \", p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "w=  \n"
     ]
    }
   ],
   "source": [
    "# w没有被改变，因为 word 被两次赋值，没有指向 w 的地址\n",
    "w = ''\n",
    "word = w\n",
    "word = 'lolcat'\n",
    "print(\"w= \", w)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "p=  []\np=  ['noun']\nproperties=  ['noun']\np=  ['noun']\nproperties=  5\n"
     ]
    }
   ],
   "source": [
    "# p被改变了，因为 properties 没有被再次赋值，而是在原地址访问的链表内追加数据\n",
    "# 当 properties 被再次赋值时，p 已经被改变\n",
    "p = []\n",
    "properties = p\n",
    "print(\"p= \", p)\n",
    "properties.append('noun')\n",
    "print(\"p= \", p)\n",
    "print(\"properties= \", properties)\n",
    "properties = 5\n",
    "print(\"p= \", p)\n",
    "print(\"properties= \", properties)"
   ]
  },
  {
   "source": [
    "### 4.4.3 变量的作用域\n",
    "名称解析的LGB规则（本地（local）、全局（global）、内置（built-in））"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "### 4.4.4 参数类型检查(P160)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "tag('the')=  det\ntag('knight')=  noun\ntag(['Tis', 'but', 'a', 'scratch'])=  noun\n"
     ]
    }
   ],
   "source": [
    "# 没有参数类型检查的函数\n",
    "def tag(word):\n",
    "    if word in ['a', 'the', 'all']:\n",
    "        return 'det'\n",
    "    else:\n",
    "        return 'noun'\n",
    "\n",
    "\n",
    "print(\"tag('the')= \", tag('the'))\n",
    "print(\"tag('knight')= \", tag('knight'))\n",
    "# 传入链表后，函数返回值是错误的\n",
    "print(\"tag(['Tis', 'but', 'a', 'scratch'])= \", tag(['Tis', 'but', 'a', 'scratch']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "tag('the')=  det\ntag('knight')=  noun\n"
     ]
    },
    {
     "output_type": "error",
     "ename": "AssertionError",
     "evalue": "argument to tag() must be a string",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mAssertionError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-10-0f67bbfe4e44>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m     11\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"tag('knight')= \"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtag\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'knight'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     12\u001b[0m \u001b[1;31m# 传入链表后，函数断言失败\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 13\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"tag(['Tis', 'but', 'a', 'scratch'])= \"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtag\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Tis'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'but'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'a'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'scratch'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32m<ipython-input-10-0f67bbfe4e44>\u001b[0m in \u001b[0;36mtag\u001b[1;34m(word)\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;31m# 使用断言进行参数类型检查\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mtag\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mword\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m     \u001b[1;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mword\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"argument to tag() must be a string\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      4\u001b[0m     \u001b[1;32mif\u001b[0m \u001b[0mword\u001b[0m \u001b[1;32min\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;34m'a'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'the'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'all'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[1;34m'det'\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mAssertionError\u001b[0m: argument to tag() must be a string"
     ]
    }
   ],
   "source": [
    "# 使用断言进行参数类型检查\n",
    "def tag(word):\n",
    "    assert isinstance(word, str), \"argument to tag() must be a string\"\n",
    "    if word in ['a', 'the', 'all']:\n",
    "        return 'det'\n",
    "    else:\n",
    "        return 'noun'\n",
    "\n",
    "\n",
    "print(\"tag('the')= \", tag('the'))\n",
    "print(\"tag('knight')= \", tag('knight'))\n",
    "# 传入链表后，函数断言失败\n",
    "print(\"tag(['Tis', 'but', 'a', 'scratch'])= \", tag(['Tis', 'but', 'a', 'scratch']))"
   ]
  },
  {
   "source": [
    "### 4.4.5 功能分解"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 使用函数提高程序的可读性和可维护性\n",
    "def load_corpus():\n",
    "    return -1\n",
    "\n",
    "\n",
    "def analyze(data):\n",
    "    return -1\n",
    "\n",
    "\n",
    "def present(results):\n",
    "    return\n",
    "\n",
    "\n",
    "data = load_corpus()\n",
    "results = analyze(data)\n",
    "present(results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "from urllib import request\n",
    "from bs4 import BeautifulSoup\n",
    "\n",
    "constitution = \"http://www.archives.gov/exhibits/charters/constitution_transcript.html\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "['the', 'of', 'archives', ',', 'and', 'national', '.', 'documents', 'constitution', 'founding', 'to', 'declaration', 'a', 'visit', 'for', 'online', 'freedom', \"'s\", '·', 'states', 'rights', 'charters', 'america', 'us', 'independence', 'united', 'or', 'home', 'resources', 'explore']\n"
     ]
    }
   ],
   "source": [
    "# Ex4-2 计算高频词的拙劣函数，存在的几个问题：\n",
    "# 1）修改了第二个参数的内容\n",
    "# 2）输出了已经计算过的结果\n",
    "def freq_words(url, freqdist, n):\n",
    "    html = request.urlopen(url).read().decode('utf8')\n",
    "    text = BeautifulSoup(html, 'html.parser').get_text()\n",
    "    for word in nltk.word_tokenize(text):\n",
    "        freqdist[word.lower()] += 1\n",
    "    result = []\n",
    "    for word, count in freqdist.most_common(n):\n",
    "        result += [word]\n",
    "    print(result)\n",
    "\n",
    "\n",
    "fd = nltk.FreqDist()\n",
    "freq_words(constitution, fd, 30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "['the', 'of', 'archives', ',', 'and', 'national', '.', 'documents', 'constitution', 'founding', 'to', 'declaration', 'a', 'visit', 'for', 'online', 'freedom', \"'s\", '·', 'states', 'rights', 'charters', 'america', 'us', 'independence', 'united', 'or', 'home', 'resources', 'explore']\n"
     ]
    }
   ],
   "source": [
    "# 重构Ex4-2该函数，得到Ex4-3 用来计算高频词的函数\n",
    "def freq_words(url, n):\n",
    "    html = request.urlopen(url).read().decode('utf8')\n",
    "    text = BeautifulSoup(html, 'html.parser').get_text()\n",
    "    fd = nltk.FreqDist(\n",
    "            word.lower()\n",
    "            for word in nltk.word_tokenize(text)\n",
    "    )\n",
    "    return [word for (word, _) in fd.most_common(n)]\n",
    "\n",
    "\n",
    "print(freq_words(constitution, 30))"
   ]
  },
  {
   "source": [
    "### 4.4.6 文档说明函数(P163)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Help on function accuracy in module __main__:\n\naccuracy(reference, test)\n    Calculate the fraction of test items that equal the corresponding reference items.\n    \n    Given a list of reference values and a corresponding list of test values,\n    return the fraction of corresponding values that are equal.\n    In particular, return the fraction of indexes\n    {0<i<=len(test)} such that C{test[i] == reference[i]}.\n    \n        >>> accuracy(['ADJ', 'N', 'V', 'N'], ['N', 'N', 'V', 'ADJ'])\n        0.5\n    \n    :param reference: An ordered list of reference values\n    :type reference: list\n    :param test: A list of values to compare against the corresponding\n        reference values\n    :type test: list\n    :return: the accuracy score\n    :rtype: float\n    :raises ValueError: If reference and length do not have the same length\n\n"
     ]
    }
   ],
   "source": [
    "# Ex4-4: 完整的 docstring，\n",
    "# 包括总结、详细的解释、doctest的例子以及特定的参数、类型、返回值类型和异常标记\n",
    "def accuracy(reference, test):\n",
    "    \"\"\"\n",
    "    Calculate the fraction of test items that equal the corresponding reference items.\n",
    "\n",
    "    Given a list of reference values and a corresponding list of test values,\n",
    "    return the fraction of corresponding values that are equal.\n",
    "    In particular, return the fraction of indexes\n",
    "    {0<i<=len(test)} such that C{test[i] == reference[i]}.\n",
    "\n",
    "        >>> accuracy(['ADJ', 'N', 'V', 'N'], ['N', 'N', 'V', 'ADJ'])\n",
    "        0.5\n",
    "\n",
    "    :param reference: An ordered list of reference values\n",
    "    :type reference: list\n",
    "    :param test: A list of values to compare against the corresponding\n",
    "        reference values\n",
    "    :type test: list\n",
    "    :return: the accuracy score\n",
    "    :rtype: float\n",
    "    :raises ValueError: If reference and length do not have the same length\n",
    "    \"\"\"\n",
    "\n",
    "    if len(reference) != len(test):\n",
    "        raise ValueError(\"Lists must have the same length.\")\n",
    "    num_correct = 0\n",
    "    for x, y in zip(reference, test):\n",
    "        if x == y:\n",
    "            num_correct += 1\n",
    "    return float(num_correct) / len(reference)\n",
    "\n",
    "help(accuracy)"
   ]
  }
 ]
}